Chapter 9 Functional Analysis
9.1 Functional overview
# A tibble: 93 × 2
genome n_genes
<chr> <int>
1 EHA00531_bin.1 4139
2 EHA00535_bin.6 3426
3 EHA00539_bin.6 3839
4 EHA00726_bin.13 3950
5 EHA00928_bin.90 3569
6 EHA01202_bin.66 3139
7 EHA01281_bin.18 3922
8 EHA01439_bin.44 3572
9 EHA01634_bin.14 3731
10 EHA01845_bin.103 3752
# ℹ 83 more rows
9.1.0.2 Number of annotated genes and percentages
#How many genes have at least 1 annotation
genome_annota <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
nrow()
cat(genome_annota)311777
[1] 83.81125
9.1.0.3 Number of KEGG annotatated genes and percentages
# KEGG annotation
kegg_annota <- genome_annotations %>%
filter(!is.na(kegg)) %>%
nrow()
cat(kegg_annota)243658
[1] 78.15137
# AMR annotation
amr_annota <- genome_annotations %>%
filter(resistance_type == "AMR") %>%
nrow()
cat(amr_annota)26372
[1] 8.45861
70648
[1] 22.65979
n_pred_genes <- genome_annotations %>%
group_by(mag_id) %>%
summarize(n_genes = n()) %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)
annotated_genes <- genome_annotations %>%
filter(if_any(c(kegg, pfam, cazy), ~ !is.na(.))) %>%
group_by(mag_id) %>%
summarize(n_annotated_genes = n()) %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)ggplot(n_pred_genes, aes(x= source, y = n_genes, fill = source))+
scale_fill_manual(values = source_colors)+
geom_boxplot()+
geom_point()+
theme_classic()+
labs(y = "Gene Number", x = "Source")ggplot(annotated_genes, aes(x= source, y = n_annotated_genes, fill = source))+
scale_fill_manual(values = source_colors)+
geom_boxplot()+
geom_point()+
theme_classic()+
labs(y = "Annotated Gene Number", x = "Source")# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 394. 0.0000776 Wilcoxon rank sum test with continuity correction two.sided
# A tibble: 1 × 4
statistic p.value method alternative
<dbl> <dbl> <chr> <chr>
1 350. 0.0000153 Wilcoxon rank sum test with continuity correction two.sided
9.2 KEGG
9.2.1 Nº of MAGs with KOs
#Proportion of MAGs belonging to each group per KEGG
#I want to know for example 5% of MAGs in the EHI group have this KEGG, but 40 % of MAGs in the GTDB group have it
# KEGG presence/absence
kegg_presence <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1") %>% # remove weird MAG
filter(!is.na(kegg)) %>%
dplyr::select(mag_id, kegg) %>%
distinct()
#Add the source info
kegg_with_source <- kegg_presence %>%
left_join(
genome_metadata %>% dplyr::select(ID, source),
by = c("mag_id" = "ID")
)
# Count how many MAGs in each KEGG
kegg_mag_counts <- kegg_with_source %>%
group_by(source, kegg) %>%
summarise(
n_mags = n(),
.groups = "drop"
)
#Count total MAGs per source (except the outlier)
total_mags_per_source <- genome_metadata %>%
filter(ID != "GCA_015060925.1") %>%
group_by(source) %>%
summarise(
total_mags = n_distinct(ID),
.groups = "drop"
)
#Calculate proportions of MAGs from each source in each KEGG
kegg_mag_proportions <- kegg_mag_counts %>%
left_join(total_mags_per_source, by = "source") %>%
mutate(
proportion = n_mags / total_mags,
absent = total_mags - n_mags
)9.2.1.1 Plotting KEGG MAG proportions
# A tibble: 3,068 × 6
source kegg n_mags total_mags proportion absent
<chr> <chr> <int> <int> <dbl> <int>
1 EHI K00010 25 25 1 0
2 EHI K00012 24 25 0.96 1
3 EHI K00013 25 25 1 0
4 EHI K00014 22 25 0.88 3
5 EHI K00015 24 25 0.96 1
6 EHI K00018 22 25 0.88 3
7 EHI K00024 25 25 1 0
8 EHI K00027 23 25 0.92 2
9 EHI K00030 24 25 0.96 1
10 EHI K00033 25 25 1 0
# ℹ 3,058 more rows
ggplot(kegg_mag_proportions,
aes(x = kegg, y = proportion, fill = source)) +
geom_col(position = "dodge") +
scale_fill_manual(values = source_colors) +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, hjust = 1)
) +
labs(
x = "KEGG ortholog",
y = "Proportion of MAGs"
)9.2.1.2 Statistical testing of MAG proportions
fisher_results <- kegg_mag_proportions %>%
dplyr::select(kegg, source, n_mags, absent) %>%
pivot_wider(
names_from = source,
values_from = c(n_mags, absent),
values_fill = 0
) %>%
rowwise() %>%
mutate(
p_value = fisher.test(
matrix(
c(n_mags_EHI, absent_EHI,
n_mags_GTDB, absent_GTDB),
nrow = 2,
byrow = TRUE
)
)$p.value
) %>%
ungroup() %>%
mutate(p_adj = p.adjust(p_value, method = "BH"))
fisher_results <- fisher_results %>%
mutate(
prop_EHI = n_mags_EHI / (n_mags_EHI + absent_EHI),
prop_GTDB = n_mags_GTDB / (n_mags_GTDB + absent_GTDB),
diff_prop = prop_GTDB - prop_EHI
)# A tibble: 4 × 10
kegg n_mags_EHI n_mags_GTDB absent_EHI absent_GTDB p_value p_adj prop_EHI prop_GTDB diff_prop
<chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 K03205 3 40 22 28 0.0000486 0.0281 0.12 0.588 0.468
2 K06926 2 38 23 30 0.0000318 0.0245 0.08 0.559 0.479
3 K13525 14 68 11 0 0.0000000731 0.000169 0.56 1 0.44
4 K20331 13 64 12 4 0.0000118 0.0137 0.52 0.941 0.421
library(KEGGREST)
# Function to get info from KEGG
get_kegg_info <- function(ko_id) {
query <- keggGet(ko_id)[[1]]
# Extract Name and Definition
name <- ifelse(!is.null(query$NAME), query$NAME, "N/A")
definition <- ifelse(!is.null(query$DEFINITION), query$DEFINITION, "N/A")
# Extract Pathways (often multiple)
pathways <- if(!is.null(query$PATHWAY)) {
paste(names(query$PATHWAY), query$PATHWAY, sep = ": ", collapse = "; ")
} else {
"No pathway assigned"
}
return(c(KO = ko_id, Name = name, Definition = definition, Pathways = pathways))
}signigicant_kos <- fisher_results%>% filter(p_adj < 0.05) %>% pull(kegg)
ko_list <- lapply(signigicant_kos, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_list))
print(ko_summary) KO Name Definition
1 K03205 type IV secretion system protein VirD4 [EC:7.4.2.8] N/A
2 K06926 uncharacterized protein N/A
3 K13525 transitional endoplasmic reticulum ATPase N/A
4 K20331 toxoflavin synthase [EC:2.1.1.349] N/A
Pathways
1 map03070: Bacterial secretion system
2 No pathway assigned
3 map04137: Mitophagy - animal; map04141: Protein processing in endoplasmic reticulum; map05014: Amyotrophic lateral sclerosis; map05022: Pathways of neurodegeneration - multiple diseases; map05134: Legionellosis
4 map02024: Quorum sensing
fish_results_names <- left_join(sig_fisher_results, ko_summary, join_by(kegg== KO))
heatmap_data <- fish_results_names %>%
dplyr::select(Name, prop_EHI, prop_GTDB)%>%
pivot_longer(!Name, names_to = "source", values_to= "proportion" ) %>%
mutate(
source = case_when(
source == "prop_EHI" ~ "EHI",
source == "prop_GTDB" ~ "GTDB",
TRUE ~ source
)
)ggplot(heatmap_data,
aes(x = source, y = Name, fill = proportion)) +
geom_tile() +
scale_fill_viridis_c(
name = "Proportion of MAGs",
limits = c(0, 1)
) +
theme_minimal() +
labs(
x = "Source",
y = "KEGG ortholog",
title = "Differential KEGG presence between EHI and GTDB"
)9.2.2 Functional Ordination
PCA of KEGG annotations:
kegg_counts <- genome_annotations %>%
filter(mag_id != "GCA_015060925.1")%>% #this is the smaller mag that's weird
filter(!is.na(kegg)) %>%
dplyr::count(mag_id, kegg) %>%
pivot_wider(
names_from = kegg,
values_from = n,
values_fill = 0
)
# Normalization
kegg_rel <- kegg_counts %>%
column_to_rownames("mag_id")
kegg_rel <- kegg_rel / rowSums(kegg_rel) # Each row sums to 1
# Remove zero variance
kegg_rel_nz <- kegg_rel[, apply(kegg_rel, 2, sd) > 0]
# PCA with scaling
pca <- prcomp(kegg_rel_nz, scale. = TRUE)
# Check variance explained
summary(pca)$importance[,1:5] PC1 PC2 PC3 PC4 PC5
Standard deviation 17.94329 9.61480 8.85902 8.735097 8.450299
Proportion of Variance 0.13926 0.03998 0.03395 0.033000 0.030890
Cumulative Proportion 0.13926 0.17924 0.21319 0.246190 0.277080
country_palette <- c(
# Southern Europe
"Spain" = "#1B9E77",
"Italy" = "#33A02C",
"Greece" = "#66C2A5",
"Portugal" = "#2CA25F",
"Malta" = "#99D8C9",
# Northern/Central Europe
"Germany" = "#1F78B4",
"United Kingdom" = "#4A90E2",
"Ireland" = "#6BAED6",
# East Asia
"Japan" = "#E31A1C",
"South Korea" = "#FB6A4A",
"China" = "#CB181D",
# North America
"USA" = "#756BB1",
"Canada" = "#9E9AC8",
# Distinct
"Australia" = "#FFD92F",
"Greenland" = "#A6CEE3",
"none" = "grey70"
)scores <- as.data.frame(pca$x)
scores$ID <- rownames(scores)
scores <- scores %>%
left_join(genome_metadata, by = "ID")
ggplot(scores, aes(PC1, PC2, color = source)) +
geom_point(size = 2) +
scale_color_manual(values = source_colors)+
theme_minimal() +
labs(
title = "PCA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC2, PC3, color = source)) +
scale_color_manual(values = source_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "P2 vs PC3 of KEGG annotations across MAGs",
x = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)"),
y = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)")
)ggplot(scores, aes(PC3, PC4, color = source)) +
scale_color_manual(values = source_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "P3 vs PC4 of KEGG annotations across MAGs",
x = paste0("PC3 (", round(summary(pca)$importance[2,3] * 100, 1), "%)"),
y = paste0("PC4 (", round(summary(pca)$importance[2,4] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by genome size",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = completeness)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by completeness",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = host_species)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by host species ",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
theme_minimal() +
scale_color_manual(values = country_palette, na.value = "grey70")+
labs(
title = "KEGG PCA colored by country ",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)ggplot(scores, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "KEGG PCA colored by continent",
x = paste0("PC1 (", round(summary(pca)$importance[2,1] * 100, 1), "%)"),
y = paste0("PC2 (", round(summary(pca)$importance[2,2] * 100, 1), "%)")
)loadings <- pca$rotation
abs_loadings <- apply(abs(loadings[, 1:2]), 1, sum)
top_combined <- sort(abs_loadings, decreasing = TRUE)[1:10]
top_combined K00428 K13663 K19783 K12035 K03310 K19234 K01442 K07012 K15971 K02574
0.09395610 0.09123926 0.08962148 0.08809128 0.08623159 0.08587434 0.08425436 0.08356048 0.08308556 0.08301173
library(KEGGREST)
top_loadings <- loadings %>%
as.data.frame() %>%
rownames_to_column("KEGG") %>%
arrange(desc(abs(PC1)))
head(top_loadings) KEGG PC1 PC2 PC3 PC4 PC5 PC6 PC7 PC8 PC9
1 K00318 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
2 K00603 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
3 K00602 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
4 K00241 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
5 K00760 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
6 K00761 0.05474364 -0.009694579 -0.004520008 0.001965835 -0.0091653 0.006068488 0.005395356 -0.003637749 -0.001571744
PC10 PC11 PC12 PC13 PC14 PC15 PC16 PC17 PC18
1 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
2 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
3 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
4 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
5 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
6 0.005902403 -0.004233131 -0.004894505 -0.003881218 0.001778101 0.004588241 0.004266847 0.004969778 -0.005891177
PC19 PC20 PC21 PC22 PC23 PC24 PC25 PC26 PC27
1 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
2 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
3 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
4 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
5 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
6 0.002499591 -0.002737706 -0.003074745 -0.0003239928 0.001770124 0.006645987 0.0004019688 -0.003059053 0.0006535857
PC28 PC29 PC30 PC31 PC32 PC33 PC34 PC35 PC36
1 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
2 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
3 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
4 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
5 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
6 -0.0005719754 0.001903491 0.0001743872 -0.001443894 -0.0002852472 -0.00147588 -0.002148302 -0.0004669545 -0.001198529
PC37 PC38 PC39 PC40 PC41 PC42 PC43 PC44 PC45
1 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
2 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
3 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
4 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
5 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
6 -0.002202629 0.001641412 0.001385291 -0.0006346985 -0.0006805273 0.0007669444 -0.003660098 -0.0004472639 -0.0006396953
PC46 PC47 PC48 PC49 PC50 PC51 PC52 PC53 PC54
1 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
2 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
3 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
4 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
5 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
6 -0.0002316659 -0.001496334 -0.001678697 0.001604079 -5.506554e-05 -0.002389476 0.001461016 -0.0009120602 -0.0009106807
PC55 PC56 PC57 PC58 PC59 PC60 PC61 PC62 PC63
1 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
2 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
3 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
4 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
5 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
6 -0.00153743 0.0007363891 0.001999625 0.002177865 -0.002292433 0.003403386 -0.001181503 5.05191e-05 0.00147613
PC64 PC65 PC66 PC67 PC68 PC69 PC70 PC71 PC72
1 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
2 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
3 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
4 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
5 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
6 -0.0008828989 0.0004442235 0.003283536 0.0001249199 -0.0004132665 0.002572613 0.001327565 0.001791235 0.0003839878
PC73 PC74 PC75 PC76 PC77 PC78 PC79 PC80 PC81
1 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
2 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
3 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
4 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
5 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
6 -0.001050797 0.0008920328 0.0009844757 -0.00161418 -0.001416298 -0.00276859 0.001776861 -0.0002906129 0.002162745
PC82 PC83 PC84 PC85 PC86 PC87 PC88 PC89 PC90
1 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
2 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
3 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
4 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
5 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
6 0.001960269 -0.0005624031 -0.0003303478 0.001465212 0.001264419 0.0008391462 -0.0004377041 0.001767447 8.081168e-05
PC91 PC92 PC93
1 -0.003479609 0.0005672297 -0.060196326
2 -0.003479609 0.0005672297 -0.028827514
3 -0.003479609 0.0005672297 -0.015508586
4 -0.003479609 0.0005672297 -0.007955012
5 -0.003479609 0.0005672297 0.005988844
6 -0.003479609 0.0005672297 0.005988844
top_kos <- head(top_loadings)%>%pull(KEGG)
ko_pathways <- lapply(top_kos, function(k) {
info <- keggGet(paste0("ko:", k))[[1]]
pathways <- info$PATHWAY
if(is.null(pathways)) pathways <- NA
return(pathways)
})
names(ko_pathways) <- top_kos
ko_pathways$K00318
map00330 map01100 map01110
"Arginine and proline metabolism" "Metabolic pathways" "Biosynthesis of secondary metabolites"
$K00603
map00340 map00670 map01100
"Histidine metabolism" "One carbon pool by folate" "Metabolic pathways"
$K00602
map00230 map00670 map01100
"Purine metabolism" "One carbon pool by folate" "Metabolic pathways"
map01110 map01523
"Biosynthesis of secondary metabolites" "Antifolate resistance"
$K00241
map00020 map00190
"Citrate cycle (TCA cycle)" "Oxidative phosphorylation"
map00650 map00720
"Butanoate metabolism" "Other carbon fixation pathways"
map01100 map01110
"Metabolic pathways" "Biosynthesis of secondary metabolites"
map01120 map01200
"Microbial metabolism in diverse environments" "Carbon metabolism"
$K00760
map00230 map00983 map01100
"Purine metabolism" "Drug metabolism - other enzymes" "Metabolic pathways"
map01110 map01232
"Biosynthesis of secondary metabolites" "Nucleotide metabolism"
$K00761
map00240 map01100 map01232
"Pyrimidine metabolism" "Metabolic pathways" "Nucleotide metabolism"
9.2.3 KEGG heatmap
# Relative abundance heatmap
pheatmap(kegg_rel_nz,
color = viridis(100, option = "viridis"),
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize = 9,
border_color = NA
)# For scaled heatmap -> remove zero variance columns
col_vars <- apply(kegg_rel_nz, 2, var, na.rm = TRUE)
matrix_filtered <- kegg_rel_nz[, col_vars > 0 & !is.na(col_vars)]
# Scale the filtered matrix
scaled <- scale(matrix_filtered, center = TRUE, scale = TRUE)
# Check for any remaining Inf/NaN values
if(any(!is.finite(scaled))) {
scaled[!is.finite(scaled)] <- 0 # Replace Inf/NaN with 0
}
# Scaled heatmap
pheatmap(scaled,
color = viridis(100, option = "viridis"),
cluster_rows = TRUE,
cluster_cols = TRUE,
fontsize = 5,
border_color = NA
)9.2.4 Testing the differences in KEGG annotations with PERMANOVA
# Distance matrix from normalized data
kegg_dist <- vegdist(kegg_rel_nz, method = "bray")
# Add metadata
kegg_rel_nz_meta <- kegg_rel_nz %>%
rownames_to_column("ID") %>%
left_join(genome_metadata, by = "ID")
#Beta dispersion test
dispersion <- betadisper(kegg_dist, kegg_rel_nz_meta$source)
anova(dispersion) Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.005554 0.0055536 9.9304 0.002201 **
Residuals 91 0.050891 0.0005592
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA test
permanova_result <- adonis2(kegg_dist ~ genome_size + source,
data = kegg_rel_nz_meta,
permutations = 999)
print(permanova_result)Permutation test for adonis under reduced model
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist ~ genome_size + source, data = kegg_rel_nz_meta, permutations = 999)
Df SumOfSqs R2 F Pr(>F)
Model 2 0.20587 0.2843 17.876 0.001 ***
Residual 90 0.51825 0.7157
Total 92 0.72412 1.0000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
adonis2(
kegg_dist ~ genome_size + source,
data = kegg_rel_nz_meta,
permutations = 999,
by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist ~ genome_size + source, data = kegg_rel_nz_meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.14688 0.20284 25.5069 0.001 ***
source 1 0.01024 0.01415 1.7789 0.049 *
Residual 90 0.51825 0.71570
Total 92 0.72412 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
kegg_counts_mat <- kegg_counts %>%
column_to_rownames("mag_id")
kegg_pa <- (kegg_counts_mat > 0) * 1
# remove zero-variance KOs
kegg_pa_nz <- kegg_pa[, colSums(kegg_pa) > 0 & colSums(kegg_pa) < nrow(kegg_pa)]
meta <- genome_metadata %>%
filter(ID %in% rownames(kegg_pa_nz)) %>%
column_to_rownames("ID")
# VERY IMPORTANT: enforce same order
meta <- meta[rownames(kegg_pa_nz), ]
stopifnot(identical(rownames(meta), rownames(kegg_pa_nz)))
kegg_dist_pa <- vegdist(kegg_pa_nz, method = "jaccard", binary = TRUE)
# dispersion check
disp_pa <- betadisper(kegg_dist_pa, meta$source)
anova(disp_pa)Analysis of Variance Table
Response: Distances
Df Sum Sq Mean Sq F value Pr(>F)
Groups 1 0.027469 0.0274695 17.117 7.827e-05 ***
Residuals 91 0.146036 0.0016048
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# PERMANOVA
adonis2(
kegg_dist_pa ~ genome_size +completeness+ source,
data = meta,
permutations = 999, by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist_pa ~ genome_size + completeness + source, data = meta, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
genome_size 1 0.09249 0.04627 4.6819 0.001 ***
completeness 1 0.01738 0.00870 0.8798 0.611
source 1 0.04227 0.02115 2.1400 0.009 **
Residual 89 1.75813 0.87964
Total 92 1.99870 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Identify rows that have NO missing values
keep_indices <- complete.cases(meta[, c("continent", "genome_size", "completeness", "source")])
# Filter the metadata
meta_clean <- meta[keep_indices, ]
kegg_dist_mat <- as.matrix(kegg_dist_pa)
kegg_dist_clean <- as.dist(kegg_dist_mat[keep_indices, keep_indices])
adonis2(
kegg_dist_clean ~ country + genome_size + completeness,
data = meta_clean,
permutations = 999,
by = "margin"
)Permutation test for adonis under reduced model
Marginal effects of terms
Permutation: free
Number of permutations: 999
adonis2(formula = kegg_dist_clean ~ country + genome_size + completeness, data = meta_clean, permutations = 999, by = "margin")
Df SumOfSqs R2 F Pr(>F)
country 14 0.43481 0.28606 1.8155 0.001 ***
genome_size 1 0.04218 0.02775 2.4659 0.004 **
completeness 1 0.01180 0.00776 0.6896 0.835
Residual 53 0.90666 0.59647
Total 69 1.52004 1.00000
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
pcoa_pa <- cmdscale(kegg_dist_pa, eig = TRUE, k = 2)
pcoa_df <- data.frame(
ID = rownames(kegg_pa_nz),
PC1 = pcoa_pa$points[,1],
PC2 = pcoa_pa$points[,2],
source = meta$source
) %>%
left_join(genome_metadata, by = "ID")
ggplot(pcoa_df, aes(PC1, PC2, color = source.x)) +
geom_point(size = 3, alpha = 0.8) +
theme_classic()ggplot(pcoa_df, aes(PC1, PC2, color = completeness)) +
geom_point(size = 3, alpha = 0.8) +
theme_classic()ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 3, alpha = 0.8) +
theme_classic()9.2.5 Principal Coordinate Analysis - KEGG
pcoa_res <- cmdscale(kegg_dist, k = 2, eig = TRUE)
# Calculate variance explained
variance_explained <- pcoa_res$eig / sum(pcoa_res$eig)
pcoa_df <- data.frame(
PC1 = pcoa_res$points[,1],
PC2 = pcoa_res$points[,2],
ID = rownames(kegg_rel_nz)
) %>%
left_join(genome_metadata, by = "ID")
ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
scale_color_manual(values = source_colors)+
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = genome_size)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)ggplot(pcoa_df, aes(PC1, PC2, color = host_species)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs (colored by host species)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) ggplot(pcoa_df, aes(PC1, PC2, color = country)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs (colored by country)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) ggplot(pcoa_df, aes(PC1, PC2, color = continent)) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs (colored by continent)",
x = paste0("PC1 (",round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
)library(patchwork)
p_main <- ggplot(pcoa_df, aes(PC1, PC2, color = source)) +
scale_color_manual(values = source_colors) +
geom_point(size = 2) +
theme_minimal() +
labs(
title = "PCoA of KEGG annotations across MAGs",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)"),
y = paste0("PC2 (", round(variance_explained[2] * 100, 1), "%)")
) +
theme(
legend.position = "right",
axis.title.x = element_blank(),
axis.text.x = element_blank(),
axis.ticks.x = element_blank(),
plot.margin = margin(5, 5, 0, 5)
)
cor_value <- cor(pcoa_df$PC1, pcoa_df$completeness, method = "spearman")
cor_pvalue <- cor.test(pcoa_df$PC1, pcoa_df$completeness,
method = "spearman")$p.valueWarning in cor.test.default(pcoa_df$PC1, pcoa_df$completeness, method = "spearman"): Cannot compute exact p-value with
ties
cor_text <- paste0("Spearman ρ = ", round(cor_value, 3),
"\np < ", format.pval(cor_pvalue, digits = 2))
p_completeness_annotated <- ggplot(pcoa_df, aes(x = PC1, y = completeness,
color = source)) +
scale_color_manual(values = source_colors) +
geom_point(alpha = 0.6, size = 2) +
geom_smooth(method = "lm", se = TRUE, color = "black", linewidth = 1) +
annotate("text",
x = min(pcoa_df$PC1) + 0.1 * diff(range(pcoa_df$PC1)),
y = max(pcoa_df$completeness) - 5,
label = cor_text,
hjust = 0,
size = 3, # Changed from 4 to 3 (smaller)
fontface = "plain") + # Changed from "bold" to "plain"
theme_minimal() +
labs(
y = "Completeness (%)",
x = paste0("PC1 (", round(variance_explained[1] * 100, 1), "%)")
) +
theme(
legend.position = "none",
plot.margin = margin(0, 5, 5, 5),
panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5) # Add border
)
p_main <- p_main +
theme(panel.border = element_rect(color = "black", fill = NA, linewidth = 0.5))
combined_annotated <- p_main / p_completeness_annotated +
plot_layout(heights = c(3, 1), guides = "collect")
print(combined_annotated)`geom_smooth()` using formula = 'y ~ x'
9.2.6 Differential abundance
Differential expression analysis- trying to find which KEGG orthologs are significantly different between EHI vs GTDB DESeq2
Cargando paquete requerido: S4Vectors
Cargando paquete requerido: stats4
Cargando paquete requerido: BiocGenerics
Adjuntando el paquete: 'BiocGenerics'
The following object is masked from 'package:gridExtra':
combine
The following object is masked from 'package:tinytable':
colnames
The following objects are masked from 'package:lubridate':
intersect, setdiff, union
The following objects are masked from 'package:dplyr':
combine, intersect, setdiff, union
The following objects are masked from 'package:stats':
IQR, mad, sd, var, xtabs
The following objects are masked from 'package:base':
anyDuplicated, aperm, append, as.data.frame, basename, cbind, colnames, dirname, do.call, duplicated,
eval, evalq, Filter, Find, get, grep, grepl, intersect, is.unsorted, lapply, Map, mapply, match, mget,
order, paste, pmax, pmax.int, pmin, pmin.int, Position, rank, rbind, Reduce, rownames, sapply, setdiff,
table, tapply, union, unique, unsplit, which.max, which.min
Adjuntando el paquete: 'S4Vectors'
The following objects are masked from 'package:Matrix':
expand, unname
The following object is masked from 'package:ggtree':
expand
The following objects are masked from 'package:lubridate':
second, second<-
The following objects are masked from 'package:dplyr':
first, rename
The following object is masked from 'package:tidyr':
expand
The following object is masked from 'package:utils':
findMatches
The following objects are masked from 'package:base':
expand.grid, I, unname
Cargando paquete requerido: IRanges
Adjuntando el paquete: 'IRanges'
The following object is masked from 'package:webshot':
resize
The following object is masked from 'package:rstatix':
desc
The following object is masked from 'package:nlme':
collapse
The following object is masked from 'package:ggtree':
collapse
The following object is masked from 'package:phyloseq':
distance
The following object is masked from 'package:lubridate':
%within%
The following objects are masked from 'package:dplyr':
collapse, desc, slice
The following object is masked from 'package:purrr':
reduce
The following object is masked from 'package:R.oo':
trim
The following object is masked from 'package:grDevices':
windows
Cargando paquete requerido: GenomicRanges
Cargando paquete requerido: GenomeInfoDb
Cargando paquete requerido: SummarizedExperiment
Cargando paquete requerido: MatrixGenerics
Cargando paquete requerido: matrixStats
Warning: package 'matrixStats' was built under R version 4.4.3
Adjuntando el paquete: 'matrixStats'
The following object is masked from 'package:dplyr':
count
Adjuntando el paquete: 'MatrixGenerics'
The following objects are masked from 'package:matrixStats':
colAlls, colAnyNAs, colAnys, colAvgsPerRowSet, colCollapse, colCounts, colCummaxs, colCummins,
colCumprods, colCumsums, colDiffs, colIQRDiffs, colIQRs, colLogSumExps, colMadDiffs, colMads, colMaxs,
colMeans2, colMedians, colMins, colOrderStats, colProds, colQuantiles, colRanges, colRanks, colSdDiffs,
colSds, colSums2, colTabulates, colVarDiffs, colVars, colWeightedMads, colWeightedMeans,
colWeightedMedians, colWeightedSds, colWeightedVars, rowAlls, rowAnyNAs, rowAnys, rowAvgsPerColSet,
rowCollapse, rowCounts, rowCummaxs, rowCummins, rowCumprods, rowCumsums, rowDiffs, rowIQRDiffs,
rowIQRs, rowLogSumExps, rowMadDiffs, rowMads, rowMaxs, rowMeans2, rowMedians, rowMins, rowOrderStats,
rowProds, rowQuantiles, rowRanges, rowRanks, rowSdDiffs, rowSds, rowSums2, rowTabulates, rowVarDiffs,
rowVars, rowWeightedMads, rowWeightedMeans, rowWeightedMedians, rowWeightedSds, rowWeightedVars
Cargando paquete requerido: Biobase
Welcome to Bioconductor
Vignettes contain introductory material; view with 'browseVignettes()'. To cite Bioconductor, see
'citation("Biobase")', and for packages 'citation("pkgname")'.
Adjuntando el paquete: 'Biobase'
The following object is masked from 'package:MatrixGenerics':
rowMedians
The following objects are masked from 'package:matrixStats':
anyMissing, rowMedians
The following object is masked from 'package:phyloseq':
sampleNames
# Prepare the count matrix (KOs as rows, MAGs as columns)
# Ensure only integer columns are kept
counts_mtx <- kegg_counts %>%
column_to_rownames("mag_id") %>%
t()
# Prepare metadata (Must match column names of counts_mtx)
metadata <- genome_metadata %>%
filter(ID %in% colnames(counts_mtx)) %>%
column_to_rownames("ID")
# Ensure order matches exactly
metadata <- metadata[colnames(counts_mtx), , drop = FALSE]
#Create DESeq2 object
dds <- DESeqDataSetFromMatrix(countData = counts_mtx,
colData = metadata,
design = ~ source)Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in design formula are characters, converting to
factors
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
-- note: fitType='parametric', but the dispersion trend was not well captured by the
function: y = a/x + b, and a local regression fit was automatically substituted.
specify fitType='local' or 'mean' to avoid this message next time.
final dispersion estimates
fitting model and testing
# save results (Wildlife vs Human)
res <- results(dds, contrast=c("source", "EHI", "GTDB"), alpha=0.05)
summary(res)
out of 2312 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 0, 0%
LFC < 0 (down) : 10, 0.43%
outliers [1] : 0, 0%
low counts [2] : 82, 3.5%
(mean count < 0)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
ggplot(res_df,aes(x=baseMean, y=log2FoldChange)) +
geom_point(alpha=0.8) +
geom_smooth() +
scale_x_log10() +
geom_hline(yintercept = 0, alpha = 0.75,
color="red")+
theme_bw()+ coord_cartesian(ylim=c(-10, 10))`geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
res_df_2<- res_df%>%
mutate(label2= ifelse (abs(log2FoldChange) > 0.58 ,KO, ""))
ggplot(res_df_2, aes(x=baseMean,
y=log2FoldChange,
col=padj<0.05, label=label2)) +
geom_point(alpha=0.5) +
scale_x_log10() +
geom_hline(yintercept = 0, alpha = 0.75,
color="red")+
geom_text_repel()+
theme_bw()+ coord_cartesian(ylim=c(-10, 10))Warning: ggrepel: 113 unlabeled data points (too many overlaps). Consider increasing max.overlaps
ggplot(res_df_2, aes(x=log2FoldChange, y=-log10(pvalue), color=padj < 0.05, label=label2)) +
geom_point(alpha=0.5) +
geom_vline(xintercept = 1, alpha = 0.75,
linetype="dashed")+
geom_vline(xintercept = -1, alpha = 0.75,
linetype="dashed")+
geom_text_repel() +
theme_bw()Warning: ggrepel: 89 unlabeled data points (too many overlaps). Consider increasing max.overlaps
9.2.7 Wilcoxon
#Transform to long format and run test for every KO
wilcox_results <- kegg_rel_nz_meta %>%
pivot_longer(cols = starts_with("K"),
names_to = "KO",
values_to = "rel_abundance") %>%
group_by(KO) %>%
do(tidy(wilcox.test(rel_abundance ~ source, data = .))) %>%
ungroup()
# Adjust p-values
wilcox_results <- wilcox_results %>%
mutate(p.adj = p.adjust(p.value, method = "fdr")) %>%
filter(p.adj < 0.05) %>%
arrange(p.adj)
head(wilcox_results)# A tibble: 6 × 6
KO statistic p.value method alternative p.adj
<chr> <dbl> <dbl> <chr> <chr> <dbl>
1 K08138 1483 0.0000000422 Wilcoxon rank sum test with continuity correction two.sided 0.0000976
2 K00013 1331 0.0000313 Wilcoxon rank sum test with continuity correction two.sided 0.000189
3 K00024 1339 0.0000230 Wilcoxon rank sum test with continuity correction two.sided 0.000189
4 K00033 1325 0.0000392 Wilcoxon rank sum test with continuity correction two.sided 0.000189
5 K00052 1329 0.0000337 Wilcoxon rank sum test with continuity correction two.sided 0.000189
6 K00053 1344. 0.0000186 Wilcoxon rank sum test with continuity correction two.sided 0.000189
res_df <- as.data.frame(res) %>%
rownames_to_column("KO") %>%
filter(padj < 0.05) %>%
arrange(padj)
# Find KOs that are significant in BOTH tests
common_KOs <- intersect(res_df$KO, wilcox_results$KO)
# Boxplot of the top hit
top_ko <- common_KOs[1]
ggplot(kegg_rel_nz_meta, aes(x = source, y = .data[[top_ko]], fill = source)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.5) +
theme_minimal() +
labs(title = paste("Distribution of", top_ko),
y = "Relative Abundance")# Filter data to only include the common KOs
plot_data <- kegg_rel_nz_meta %>%
dplyr::select(ID, source, all_of(common_KOs)) %>%
pivot_longer(cols = all_of(common_KOs),
names_to = "KO",
values_to = "Relative_Abundance")
ggplot(plot_data, aes(x = source, y = Relative_Abundance, fill = source)) +
geom_boxplot(outlier.shape = NA) +
geom_jitter(width = 0.2, alpha = 0.4, size = 1) +
facet_wrap(~KO, scales = "free_y", ncol = 5) + # 'free_y' is important as abundance scales vary
scale_fill_manual(values = source_colors) +
theme_minimal() +
theme(strip.text = element_text(face = "bold"),
legend.position = "bottom") +
labs(title = "Relative Abundance of Consensus Significant KOs",
x = "Host Source",
y = "Relative Abundance")Check alignment with DESeq:
# Calculate medians for each group to see direction
direction_check <- kegg_rel_nz_meta %>%
dplyr::select(source, all_of(common_KOs)) %>%
pivot_longer(-source, names_to = "KO", values_to = "val") %>%
group_by(KO, source) %>%
summarize(median_val = median(val), .groups = 'drop') %>%
pivot_wider(names_from = source, values_from = median_val) %>%
mutate(Enriched_In = ifelse(GTDB > EHI, "GTDB", "EHI"))
# Merge with your DESeq2 results to see if they align
final_comparison <- res_df %>%
filter(KO %in% common_KOs) %>%
dplyr::select(KO, log2FoldChange, padj) %>%
left_join(direction_check, by = "KO")
print(final_comparison) KO log2FoldChange padj EHI GTDB Enriched_In
1 K00754 -0.7479886 7.909738e-05 0.0031446541 0.0056385200 GTDB
2 K12063 -2.8303552 2.317878e-04 0.0000000000 0.0007237285 GTDB
3 K03496 -1.5234980 8.727123e-04 0.0004004806 0.0007883328 GTDB
4 K04763 -0.5721473 8.727123e-04 0.0072086504 0.0113583996 GTDB
5 K03205 -2.7791700 8.727123e-04 0.0000000000 0.0003911803 GTDB
6 K02316 -0.8913735 1.094241e-02 0.0008285004 0.0015585457 GTDB
7 K01185 -2.0946452 4.321013e-02 0.0000000000 0.0003613384 GTDB
8 K16695 -0.8284503 4.811695e-02 0.0008009612 0.0014506351 GTDB
9 K14059 -1.0656248 4.916919e-02 0.0003895598 0.0010485845 GTDB
10 K00986 -1.7702579 4.916919e-02 0.0000000000 0.0003677157 GTDB
target_kos <- common_KOs
ko_info_list <- lapply(target_kos, get_kegg_info)
ko_summary <- as.data.frame(do.call(rbind, ko_info_list))
print(ko_summary) KO Name Definition Pathways
1 K00754 L-malate glycosyltransferase [EC:2.4.1.-] N/A No pathway assigned
2 K12063 conjugal transfer ATP-binding protein TraC N/A No pathway assigned
3 K03496 chromosome partitioning protein N/A No pathway assigned
4 K04763 integrase/recombinase XerD N/A No pathway assigned
5 K03205 type IV secretion system protein VirD4 [EC:7.4.2.8] N/A map03070: Bacterial secretion system
6 K02316 DNA primase [EC:2.7.7.101] N/A map03030: DNA replication
7 K01185 lysozyme [EC:3.2.1.17] N/A No pathway assigned
8 K16695 lipopolysaccharide exporter N/A No pathway assigned
9 K14059 integrase N/A No pathway assigned
10 K00986 RNA-directed DNA polymerase [EC:2.7.7.49] N/A No pathway assigned
#Prepare the matrix (KOs as rows, MAGs as columns)
heatmap_mat <- kegg_rel_nz_meta %>%
column_to_rownames("ID") %>%
dplyr::select(all_of(common_KOs)) %>%
as.matrix() %>%
t()
# Prepare the annotation data frame
annotation_col <- data.frame(Source = kegg_rel_nz_meta$source)
rownames(annotation_col) <- kegg_rel_nz_meta$ID
# Check if they match
all(colnames(heatmap_mat) == rownames(annotation_col))[1] TRUE
#colors
ann_colors <- list(
Source = source_colors
)
# Plot
pheatmap(heatmap_mat,
annotation_col = annotation_col,
annotation_colors = ann_colors,
scale = "row",
show_colnames = TRUE,
cluster_cols = TRUE,
main = "Consensus KOs: Human vs Wildlife")#Prepare the matrix (KOs as rows, MAGs as columns)
heatmap_mat <- kegg_rel_nz_meta %>%
column_to_rownames("ID") %>%
dplyr::select(all_of(wilcox_results$KO)) %>%
as.matrix() %>%
t()
# Prepare the annotation data frame
annotation_col <- data.frame(Source = kegg_rel_nz_meta$source)
rownames(annotation_col) <- kegg_rel_nz_meta$ID
# Check if they match
all(colnames(heatmap_mat) == rownames(annotation_col))[1] TRUE
#colors
ann_colors <- list(
Source = source_colors
)
# Plot
pheatmap(heatmap_mat,
annotation_col = annotation_col,
annotation_colors = ann_colors,
scale = "row",
show_colnames = TRUE,
cluster_cols = TRUE,
main = "Consensus KOs: Human vs Wildlife")9.2.8 ALDEX2
library(ALDEx2)
kegg_raw_mat <- kegg_counts %>%
column_to_rownames("mag_id") %>%
as.matrix()
kegg_raw_mat[is.na(kegg_raw_mat)] <- 0
conds <- kegg_rel_nz_meta %>%
filter(ID %in% rownames(kegg_raw_mat)) %>%
arrange(match(ID, rownames(kegg_raw_mat))) %>%
pull(source)
table(conds) conds
EHI GTDB
25 68
aldex_clr <- aldex.clr(
reads = t(kegg_raw_mat),
conds = conds,
mc.samples = 128, # number of Monte Carlo instances
denom = "all",
verbose = TRUE
)
# Welch's t-test and Wilcoxon test
aldex_test <- aldex.ttest(aldex_clr)
aldex_effect <- aldex.effect(aldex_clr)
# Combine results
aldex_res <- cbind(aldex_test, aldex_effect)
sig_kegg <- aldex_res %>%
as.data.frame() %>%
rownames_to_column("KEGG") %>%
filter(we.eBH < 0.05) %>% # FDR-corrected
arrange(we.eBH)
# Add direction of enrichment
sig_kegg <- sig_kegg %>%
mutate(enriched_in = ifelse(effect > 0, "GTDB", "EHI"))
sig_kegg [1] KEGG we.ep we.eBH wi.ep wi.eBH rab.all rab.win.EHI rab.win.GTDB diff.btw
[10] diff.win effect overlap enriched_in
<0 rows> (o 0- extensión row.names)
No significant results…
GENOME SIZE is significantly larger in GTDB MAGs –> correct for genome size
Which functions are enriched per unit genome by source?
genome_sizes <- genome_metadata %>%
dplyr::select(ID, genome_size)
kegg_counts_gs <- kegg_counts %>%
left_join(genome_sizes, by = c("mag_id" = "ID")) %>%
column_to_rownames("mag_id")
# Divide by genome size (in Mb)
kegg_per_mb <- kegg_counts_gs[, -ncol(kegg_counts_gs)] /
(kegg_counts_gs$genome_size / 1e6)kegg_mat <- as.matrix(kegg_per_mb)
# Replace any NA
kegg_mat[is.na(kegg_mat)] <- 0
#Filter rare keggs
keep_kegg <- colSums(kegg_mat > 0) >= 0.1 * nrow(kegg_mat)
kegg_mat <- kegg_mat[, keep_kegg]
#add a pseudocount
kegg_mat <- kegg_mat + 1e-6
conds <- kegg_rel_nz_meta %>%
filter(ID %in% rownames(kegg_mat)) %>%
arrange(match(ID, rownames(kegg_mat))) %>%
pull(source)
table(conds)conds
EHI GTDB
25 68
scale_factor <- 1e4
kegg_pseudo <- round(kegg_per_mb * scale_factor)
# Replace zeros (ALDEx2 hates structural zeros)
kegg_pseudo[kegg_pseudo == 0] <- 1aldex_clr <- aldex.clr(
reads = t(kegg_pseudo),
conds = conds,
mc.samples = 128,
denom = "all",
verbose = TRUE
)
aldex_test <- aldex.ttest(aldex_clr)
aldex_effect <- aldex.effect(aldex_clr)
aldex_res <- cbind(aldex_test, aldex_effect)
head(aldex_res) we.ep we.eBH wi.ep wi.eBH rab.all rab.win.EHI rab.win.GTDB diff.btw diff.win
K00010 0.0001063773 0.002512091 2.530797e-05 0.0001483258 7.224545 7.494664 7.090477 -0.37785129 0.4583032
K00012 0.6501288038 1.000000000 7.406534e-01 0.9197889601 5.248405 5.183798 5.315146 -0.02973917 0.8881005
K00013 0.0003129409 0.002713804 2.310542e-05 0.0001494921 3.175763 3.474600 3.095719 -0.35789699 0.5068449
K00014 0.1795102174 1.000000000 8.466026e-03 0.0159037771 3.147306 3.409503 3.095182 -0.24028803 0.5772888
K00015 0.6819146389 0.995012247 2.074072e-04 0.0005719904 3.154343 3.442476 3.086002 -0.32476494 0.5168753
K00018 0.1811793431 1.000000000 1.098270e-02 0.0203090202 3.148386 3.385027 3.093653 -0.22941452 0.5806484
effect overlap
K00010 -0.76745096 0.2092442
K00012 -0.01246316 0.4875000
K00013 -0.63201501 0.2081250
K00014 -0.11844158 0.3306250
K00015 -0.48066477 0.2437500
K00018 -0.11295111 0.3287500
sig_kegg <- aldex_res %>%
as.data.frame() %>%
rownames_to_column("KEGG") %>%
filter(
we.eBH < 0.05,
abs(effect) > 1
) %>%
arrange(desc(abs(effect)))
sig_kegg KEGG we.ep we.eBH wi.ep wi.eBH rab.all rab.win.EHI rab.win.GTDB diff.btw diff.win
1 K08138 1.487155e-05 0.002512091 1.922896e-08 4.445735e-05 3.227229 4.113634 3.109252 -0.8298336 0.8244235
effect overlap
1 -1.022404 0.11625
library(compositions)
clr_mat <- clr(kegg_per_mb + 1e-6)
res <- apply(clr_mat, 2, function(x) {
wilcox.test(x ~ conds)$p.value
})
res_adj <- p.adjust(res, method = "BH")clr_res <- data.frame(
KEGG = names(res),
p_value = res,
p_adj = res_adj
) %>%
arrange(p_adj)
head(clr_res) KEGG p_value p_adj
K08138 K08138 2.581324e-08 5.968022e-05
K00010 K00010 2.051803e-05 1.062537e-04
K00013 K00013 2.394559e-05 1.062537e-04
K00024 K00024 2.216885e-05 1.062537e-04
K00033 K00033 2.899917e-05 1.062537e-04
K00052 K00052 2.394559e-05 1.062537e-04
[1] 2011
res_lm <- apply(clr_mat, 2, function(x) {
model <- lm(x ~ conds + kegg_counts_gs$genome_size)
summary(model)$coefficients["condsGTDB","Pr(>|t|)"]
})
res_lm_adj <- p.adjust(res_lm, method = "BH")
sum(res_lm_adj < 0.05)[1] 3
clr_res_lm <- data.frame(
KEGG = colnames(clr_mat),
p_value = res_lm,
p_adj = res_lm_adj
)
sig_kegg <- clr_res_lm %>%
filter(p_adj < 0.05) %>%
arrange(p_adj)
sig_kegg$effect <- sapply(sig_kegg$KEGG, function(k) {
median(clr_mat[, k][conds == "GTDB"]) -
median(clr_mat[, k][conds == "EHI"])
})
sig_kegg <- sig_kegg %>%
arrange(desc(abs(effect)))
sig_kegg KEGG p_value p_adj effect
K00856 K00856 1.904299e-05 1.467579e-02 -0.436244787
K13530 K13530 1.886011e-05 1.467579e-02 -0.436244787
K13525 K13525 3.445901e-09 7.966923e-06 0.008659409
9.3 GIFTs
PCoA with Euclidean distances:
# Convert the GIFTs into a matrix of functional elements per genome (row)
gift_pcoa <- genome_gifts %>%
to.elements(., GIFT_db) %>%
as.data.frame() %>%
vegdist(method="euclidean") %>%
pcoa() #principal component analysis
#Extract eigenvalues (variance explained by first 10 axes)
gift_pcoa_rel_eigen <- gift_pcoa$values$Relative_eig[1:10]
# Get genome positions
gift_pcoa_vectors <- gift_pcoa$vectors %>% #extract vectors
as.data.frame() %>%
dplyr::select(Axis.1,Axis.2) # keep the first 2 axes
gift_pcoa_eigenvalues <- gift_pcoa$values$Eigenvalues[c(1,2)]
#For the black arrows: Functional group loadings
# covariance between each functional trait and pcoa axis scores
#scale with the eigenvectors
gift_pcoa_gifts <- cov(genome_gifts, scale(gift_pcoa_vectors)) %*%
diag((gift_pcoa_eigenvalues/(nrow(genome_gifts)-1))^(-0.5)) %>%
as.data.frame() %>%
dplyr::rename(Axis.1=1,Axis.2=2) %>%
rownames_to_column(var="label") %>%
#get function summary vectors
mutate(func=substr(label,1,3)) %>%
group_by(func) %>% #grouping by function
summarise(Axis.1=mean(Axis.1),
Axis.2=mean(Axis.2)) %>%
dplyr::rename(label=func) %>%
filter(!label %in% c("S01","S02","S03"))set.seed(101)
scale <- 20 # scale for vector loadings (to make arrows visible)
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
scale_color_manual(values=source_colors)+
geom_point(aes(x=Axis.1,y=Axis.2, color = source),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = completeness),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
gift_pcoa_vectors %>%
rownames_to_column(var="ID") %>%
left_join(genome_metadata, by="ID") %>%
ggplot() +
#genome positions
geom_point(aes(x=Axis.1,y=Axis.2, color = continent),
alpha=0.9, shape=16) +
scale_size_continuous(range = c(0.1,5)) +
#loading positions
geom_segment(data=gift_pcoa_gifts,
aes(x=0, y=0, xend=Axis.1 * scale, yend=Axis.2 * scale),
arrow = arrow(length = unit(0.3, "cm"),
type = "open",
angle = 25),
linewidth = 0.5,
color = "black") +
#Primary and secondary scale adjustments
scale_x_continuous(name = paste0("PCoA1 (",round(gift_pcoa_rel_eigen[1]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA1")
) +
scale_y_continuous(name = paste0("PCoA2 (",round(gift_pcoa_rel_eigen[2]*100, digits = 2), " %)"),
sec.axis = sec_axis(~ . / scale, name = "Loadings on PCoA2")
) +
geom_label_repel(data = gift_pcoa_gifts,
aes(label = label, x = Axis.1 * scale, y = Axis.2 * scale),
segment.color = 'transparent') +
xlim(-0.8,1) +
ylim(-1,1.5) +
theme_minimal() +
labs(
x = paste0("PCoA1 (", round(gift_pcoa_rel_eigen[1]*100, 2), " %)"),
y = paste0("PCoA2 (", round(gift_pcoa_rel_eigen[2]*100, 2), " %)")
)Warning: Removed 1 row containing missing values or values outside the scale range (`geom_segment()`).
Warning: Removed 1 row containing missing values or values outside the scale range (`geom_label_repel()`).
Warning: ggrepel: 13 unlabeled data points (too many overlaps). Consider increasing max.overlaps
Using k-means to cluster the groups and check which MAGs cluster together:
coords <- gift_pcoa_vectors %>%
rownames_to_column("MAG") %>%
as_tibble()
set.seed(123)
km <- kmeans(coords[, c("Axis.1", "Axis.2")], centers = 4, nstart = 25, iter.max = 100)
coords <- coords %>% mutate(cluster = factor(km$cluster))# centroids as a tibble for plotting
centroids <- as_tibble(km$centers) %>% mutate(cluster = factor(1:nrow(km$centers)))
ggplot(coords, aes(x = Axis.1, y = Axis.2, color = cluster)) +
geom_point(size = 2, alpha = 0.8) +
geom_point(data = centroids, aes(x = Axis.1, y = Axis.2, color = cluster),
shape = 4, size = 5, stroke = 1.25) + # X marks centroids
theme_minimal() +
labs(title = paste0("PCoA colored by kmeans (k=4)"),
color = "cluster")$`1`
[1] "GCA_958435695.1" "EHM070114" "EHM049446" "GCF_001405935.1" "GCA_030844705.1" "EHM016582"
[7] "GCA_030839725.1" "EHM075512" "EHM048917" "GCF_001406015.1" "GCA_958447325.1" "GCA_958416015.1"
[13] "GCA_030839525.1" "EHM039583"
$`2`
[1] "GCA_022784765.1" "EHM049769" "GCF_003459965.1" "EHM069263" "EHM016228" "GCA_958416885.1"
[7] "GCA_008680675.1" "GCA_958404515.1" "GCA_030843875.1" "EHM020917" "GCA_022776965.1" "GCA_959028115.1"
[13] "GCA_009774835.1" "GCA_958413795.1" "GCA_959023275.1" "GCF_003474765.1" "GCF_029369685.1" "GCA_008679655.1"
$`3`
[1] "GCA_020809955.1" "GCA_014870645.1" "GCA_030842705.1" "GCA_959606485.1" "EHM027637" "GCF_012843465.1"
[7] "GCF_003469715.1" "GCF_003471825.1" "EHM046460" "GCF_003469775.1" "GCA_958422645.1" "GCA_022774455.1"
[13] "GCA_030842445.1" "GCA_958440685.1" "EHM016991" "GCF_018784195.1" "EHM067538" "GCF_001405775.1"
[19] "GCA_030844325.1" "GCA_025757725.1" "GCA_959600655.1" "GCA_958432605.1" "GCF_003437495.1" "GCF_003472045.1"
[25] "EHM064868" "GCF_020687325.1" "GCF_003468915.1" "GCF_003467575.1" "GCF_018292145.1" "GCF_003469235.1"
[31] "GCA_008679285.1" "GCA_958427815.1" "EHM047216" "GCA_958445425.1" "EHM023585" "GCF_002206325.1"
[37] "GCF_003468605.1" "GCA_958371025.1" "GCF_018288975.1" "EHM031743" "GCA_030841945.1" "GCA_959026965.1"
[43] "GCA_022771305.1" "GCA_003464145.1"
$`4`
[1] "EHM072417" "EHM029564" "GCF_019733675.1" "EHM049533" "GCF_018784125.1" "GCF_022835355.1"
[7] "GCF_003475725.1" "EHM028668" "GCF_018587995.1" "GCA_959026225.1" "GCF_022835335.1" "EHM048790"
[13] "EHM023781" "GCF_003464475.1" "GCF_018289335.1" "EHM016318" "GCF_003439415.1"
9.3.1 Checking each cluster at once
# A tibble: 14 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 EHM016582 s__Par… 92.2 1.17 3706610 45.7 7507 618 Strix aluco Strigifor… Aves
2 EHM039583 s__Par… 98.5 1.78 4472794 45 29681 243 Sciurus vul… Rodentia Mammalia
3 EHM048917 s__Par… 94.6 1.35 3837800 46 12164 428 Lepus europ… Lagomorpha Mammalia
4 EHM049446 s__Par… 90.1 0.6 3591722 46 12732 405 Lepus europ… Lagomorpha Mammalia
5 EHM070114 s__Par… 100.0 2.39 4728690 45 63727 110 Rattus ratt… Rodentia Mammalia
6 EHM075512 s__Par… 100.0 2.34 4690486 45 64909 112 Rattus ratt… Rodentia Mammalia
7 GCA_958416015.1 Paraba… 92.4 0.18 3902302 46.4 9412 497 <NA> <NA> <NA>
8 GCA_958447325.1 Paraba… 99.4 0.61 4709435 45.0 118754 66 <NA> <NA> <NA>
9 GCA_030844705.1 Paraba… 97.7 0.85 4586771 45.3 134207 49 <NA> <NA> <NA>
10 GCA_030839525.1 Paraba… 97.9 1.68 4512812 45.1 15103 417 <NA> <NA> <NA>
11 GCA_030839725.1 Paraba… 92.7 1.74 4293293 44.8 29168 238 <NA> <NA> <NA>
12 GCA_958435695.1 Paraba… 99.4 1.06 4635291 45.0 106588 71 <NA> <NA> <NA>
13 GCF_001405935.1 Paraba… 99.4 1.5 5099398 45.0 277147 45 <NA> <NA> <NA>
14 GCF_001406015.1 Paraba… 99.4 0.62 5073478 NA NA NA <NA> <NA> <NA>
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
# country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
# A tibble: 18 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 EHM016228 s__Par… 97.3 0.92 4815605 45 3.27e4 224 Strix aluco Strigifor… Aves
2 EHM020917 s__Par… 90.3 1.73 3472909 45.6 1.19e4 414 Strix aluco Strigifor… Aves
3 EHM049769 s__Par… 95.2 1.01 4794833 45 6.88e4 107 Podarcis ga… Squamata Reptilia
4 EHM069263 s__Par… 100 0.74 5112462 45 1.46e5 97 Rattus ratt… Rodentia Mammalia
5 GCA_008680675.1 Paraba… 99.4 0.69 4962458 45.2 1.06e6 6 <NA> <NA> <NA>
6 GCA_022776965.1 Paraba… 97.3 0.26 4589485 45.3 1.64e5 44 <NA> <NA> <NA>
7 GCA_958413795.1 Paraba… 99.4 0.77 4948642 45.1 1.93e5 48 <NA> <NA> <NA>
8 GCA_959028115.1 Paraba… 99.4 0.51 4709637 45.1 1.35e5 54 <NA> <NA> <NA>
9 GCA_958416885.1 Paraba… 99.4 1.32 4704055 45.0 1.19e5 58 <NA> <NA> <NA>
10 GCA_958404515.1 Paraba… 99.4 0.93 5100500 44.9 1.25e5 61 <NA> <NA> <NA>
11 GCA_030843875.1 Paraba… 96.0 2.28 4410710 45.2 1.27e4 475 <NA> <NA> <NA>
12 GCA_009774835.1 Paraba… 99.2 0.89 4866783 45.1 9.64e4 71 <NA> <NA> <NA>
13 GCA_022784765.1 Paraba… 99.4 1.14 4816138 45.1 1.29e5 55 <NA> <NA> <NA>
14 GCA_959023275.1 Paraba… 99.4 0.73 4832844 45.0 1.39e5 54 <NA> <NA> <NA>
15 GCA_008679655.1 Paraba… 97.8 0.52 5036842 45.0 7.93e5 11 <NA> <NA> <NA>
16 GCF_003474765.1 Paraba… 99.4 1.07 5043355 45.1 1.75e5 83 <NA> <NA> <NA>
17 GCF_003459965.1 Paraba… 99.4 0.83 5151392 44.9 2.28e5 59 <NA> <NA> <NA>
18 GCF_029369685.1 Paraba… 99.4 1.46 5493583 45.1 5.39e6 2 <NA> <NA> <NA>
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
# country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
# A tibble: 44 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 EHM016991 s__Par… 99.0 0.38 4748219 44.9 1.17e5 76 Podarcis pi… Squamata Reptilia
2 EHM023585 s__Par… 92.8 2.18 3889519 45.6 8.65e3 607 Sciurus vul… Rodentia Mammalia
3 EHM027637 s__Par… 93.5 2.05 4160022 45.3 1.28e4 526 Cathartes a… Accipitri… Aves
4 EHM031743 s__Par… 93.5 1.99 4260825 45 2.13e4 296 Canis famil… Carnivora Mammalia
5 EHM046460 s__Par… 99.5 1.66 4335884 45 3.87e4 181 Cathartes a… Accipitri… Aves
6 EHM047216 s__Par… 99.9 1.04 4517903 45 6.39e4 124 Lepus europ… Lagomorpha Mammalia
7 EHM064868 s__Par… 93.9 0.57 4141119 45 1.32e4 435 Lepus europ… Lagomorpha Mammalia
8 EHM067538 s__Par… 93.1 1.71 4147480 46 1.27e4 485 Lepus europ… Lagomorpha Mammalia
9 GCA_025757725.1 Paraba… 99.4 0.99 4848511 45.1 4.85e6 1 <NA> <NA> <NA>
10 GCA_008679285.1 Paraba… 95.8 1.54 4711604 45.1 3.08e5 21 <NA> <NA> <NA>
# ℹ 34 more rows
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
# country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
# A tibble: 17 × 25
ID species completeness contamination genome_size GC N50 contigs host_species host_order host_class
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <chr>
1 EHM016318 s__Par… 93.6 1.81 4085923 45.3 7.43e3 685 Strix aluco Strigifor… Aves
2 EHM023781 s__Par… 93.5 1.92 4159834 45.6 1.68e4 369 Podarcis ga… Squamata Reptilia
3 EHM028668 s__Par… 100.0 0.87 4684324 45 7.38e4 107 Timon lepid… Squamata Reptilia
4 EHM029564 s__Par… 95.9 1.5 4476906 45 4.19e4 152 Timon lepid… Squamata Reptilia
5 EHM048790 s__Par… 100.0 2.23 4479865 45 5.73e4 130 Lepus europ… Lagomorpha Mammalia
6 EHM049533 s__Par… 100.0 0.65 5203783 45 1.24e5 64 Podarcis fi… Squamata Reptilia
7 EHM072417 s__Par… 93.8 0.7 4365524 45 4.85e4 145 Timon lepid… Squamata Reptilia
8 GCA_959026225.1 Paraba… 99.4 0.54 4672770 45.1 9.15e4 77 <NA> <NA> <NA>
9 GCF_018587995.1 Paraba… 99.4 0.27 5066235 45.0 3.04e5 80 <NA> <NA> <NA>
10 GCF_018784125.1 Paraba… 99.4 1.84 4988964 45.1 1.72e5 112 <NA> <NA> <NA>
11 GCF_018289335.1 Paraba… 99.4 0.71 5311305 45.2 5.20e6 3 <NA> <NA> <NA>
12 GCF_022835335.1 Paraba… 93.6 1.43 4806077 45.0 2.76e6 26 <NA> <NA> <NA>
13 GCF_022835355.1 Paraba… 93.6 1.46 4803375 45.0 2.76e6 29 <NA> <NA> <NA>
14 GCF_003439415.1 Paraba… 99.4 1.02 5294086 45.1 2.14e5 81 <NA> <NA> <NA>
15 GCF_019733675.1 Paraba… 99.4 1.49 5285880 45.2 3.21e5 33 <NA> <NA> <NA>
16 GCF_003475725.1 Paraba… 99.4 1.84 5176159 45.0 1.12e5 111 <NA> <NA> <NA>
17 GCF_003464475.1 Paraba… 99.4 1.9 5187286 44.9 2.09e5 72 <NA> <NA> <NA>
# ℹ 14 more variables: assembly_type <chr>, isolation_source <chr>, mag_name <chr>, eha_number <chr>, locality <chr>,
# country <chr>, extra_locality <chr>, source <chr>, gtdb_taxonomy <chr>, mimag_high_quality <lgl>,
# mimag_medium_quality <lgl>, gtdb_representative <lgl>, ncbi_date <date>, continent <chr>
metadata_with_cluster <- genome_metadata %>%
left_join(coords %>% dplyr::select(MAG, cluster), by = c("ID" = "MAG"))
metadata_with_cluster %>%
group_by(cluster, source) %>%
summarise(n = n(), .groups = "drop") %>%
arrange(cluster, desc(n))# A tibble: 8 × 3
cluster source n
<fct> <chr> <int>
1 1 GTDB 8
2 1 EHI 6
3 2 GTDB 14
4 2 EHI 4
5 3 GTDB 36
6 3 EHI 8
7 4 GTDB 10
8 4 EHI 7
# Group summaries
metadata_with_cluster %>%
group_by(cluster) %>%
summarise(mean_genome_size = mean(genome_size, na.rm = TRUE),
median_contamination = median(contamination, na.rm = TRUE),
.groups = "drop")# A tibble: 4 × 3
cluster mean_genome_size median_contamination
<fct> <dbl> <dbl>
1 1 4417206. 1.26
2 2 4825680. 0.905
3 3 4708235. 1.04
4 4 4826370. 1.46
save(ehi_mags,
phylum_colors,
genome_annotations,
genome_gifts,
contig_to_genome,
gtdb_metadata,
ehi_metadata,
master_index,
genome_metadata,
source_colors,
getphylo_tree,
metadata_with_cluster,
file = "data/data.Rdata")Is the genome size different between clusters?
Kruskal-Wallis rank sum test
data: genome_size by cluster
Kruskal-Wallis chi-squared = 8.0272, df = 3, p-value = 0.04545
pairwise.wilcox.test(
x = metadata_with_cluster$genome_size,
g = metadata_with_cluster$cluster,
p.adjust.method = "BH" # FDR correction
)
Pairwise comparisons using Wilcoxon rank sum exact test
data: metadata_with_cluster$genome_size and metadata_with_cluster$cluster
1 2 3
2 0.032 - -
3 0.165 0.288 -
4 0.078 1.000 0.382
P value adjustment method: BH
Plots
ggplot(metadata_with_cluster, aes(x = genome_size, y = GC, col = cluster))+
geom_point()+
theme_bw()Warning: Removed 2 rows containing missing values or values outside the scale range (`geom_point()`).
ggplot(metadata_with_cluster, aes(x = assembly_type, y = source, col = cluster))+
geom_point()+
theme_bw()ggplot(metadata_with_cluster, aes(x = cluster, y = genome_size, col = cluster))+
geom_point()+
theme_bw()#Aggregate bundle-level GIFTs into the compound level
GIFTs_elements <- to.elements(genome_gifts,GIFT_db)
#Aggregate element-level GIFTs into the function level
GIFTs_functions <- to.functions(GIFTs_elements,GIFT_db)
#Aggregate function-level GIFTs into overall Biosynthesis, Degradation and Structural GIFTs
GIFTs_domains <- to.domains(GIFTs_functions,GIFT_db)GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var="MAG")%>%
pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=MAG,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ cluster, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_text(size=6),
strip.text.y = element_text(angle = 0)
) +
labs(y="Traits",x="Samples",fill="GIFT")GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var="MAG")%>%
pivot_longer(!MAG,names_to="trait",values_to="gift") %>%
left_join(metadata_with_cluster, by = join_by(MAG == ID)) %>%
mutate(functionid = substr(trait, 1, 3)) %>%
mutate(trait = case_when(
trait %in% GIFT_db$Code_element ~ GIFT_db$Element[match(trait, GIFT_db$Code_element)],
TRUE ~ trait
)) %>%
mutate(functionid = case_when(
functionid %in% GIFT_db$Code_function ~ GIFT_db$Function[match(functionid, GIFT_db$Code_function)],
TRUE ~ functionid
)) %>%
mutate(trait=factor(trait,levels=unique(GIFT_db$Element))) %>%
mutate(functionid=factor(functionid,levels=unique(GIFT_db$Function))) %>%
ggplot(aes(x=MAG,y=trait,fill=gift)) +
geom_tile(colour="white", linewidth=0.2)+
scale_fill_gradientn(colours=rev(c("#d53e4f", "#f46d43", "#fdae61", "#fee08b", "#e6f598", "#abdda4", "#ddf1da")))+
facet_grid(functionid ~ source, scales="free",space="free") +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
axis.text.y = element_text(size=6),
strip.text.y = element_text(angle = 0)
) +
labs(y="Traits",x="Samples",fill="GIFT")Warning: package 'reshape2' was built under R version 4.4.3
Adjuntando el paquete: 'reshape2'
The following object is masked from 'package:tidyr':
smiths
GIFTs_elements %>%
reshape2::melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db,by="Code_element") %>%
ggplot(., aes(x=Code_element, y=Genome, fill=GIFT, group=Code_function))+
geom_tile()+
scale_y_discrete(guide = guide_axis(check.overlap = TRUE))+
scale_x_discrete(guide = guide_axis(check.overlap = TRUE))+
scale_fill_gradientn(limits = c(0,1), colours=brewer.pal(7, "YlGnBu"))+
facet_grid(. ~ Code_function, scales = "free", space = "free")+
theme_grey(base_size=8)+
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),strip.text.x = element_text(angle = 90))Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Code_bundle Code_element Code_function Domain Function Element
1 B010101 B0101 B01 Biosynthesis Nucleic acid biosynthesis Inosinic acid (IMP)
2 B010201 B0102 B01 Biosynthesis Nucleic acid biosynthesis Uridylic acid (UMP)
3 B010301 B0103 B01 Biosynthesis Nucleic acid biosynthesis UDP/UTP
4 B010401 B0104 B01 Biosynthesis Nucleic acid biosynthesis CDP/CTP
5 B010501 B0105 B01 Biosynthesis Nucleic acid biosynthesis ADP/ATP
6 B010601 B0106 B01 Biosynthesis Nucleic acid biosynthesis GDP/GTP
Definition
1 K00764 (K01945,K11787,K11788,K13713) (K00601,K11175,K08289,K11787,K01492) (K01952,(K23269+K23264+K23265),(K23270+K23265)) (K01933,K11787,(K11788 (K01587,K11808,(K01589 K01588)))) (K01923,K01587,K13713) K01756 (K00602,(K01492,(K06863 K11176)))
2 (K11540,((K11541 K01465),((K01954,(K01955+K01956)) ((K00609+K00610),K00608) K01465))) (K00226,K00254,K17828) (K13421,(K00762 K01591))
3 (K13800,K13809,K09903)
4 (K00940,K18533) K01937
5 K01939 K01756 (K00939,K18532,K18533,K00944) K00940
6 K00088 K01951 K00942 (K00940,K18533)
Code_bundle Code_element Code_function Domain Function Element
1 D090101 D0901 D09 Degradation Antibiotic degradation Penicillin
2 D090201 D0902 D09 Degradation Antibiotic degradation Carbapenem
3 D090301 D0903 D09 Degradation Antibiotic degradation Cephalosporin
4 D090401 D0904 D09 Degradation Antibiotic degradation Oxacillin
5 D090501 D0905 D09 Degradation Antibiotic degradation Streptogramin
6 D090601 D0906 D09 Degradation Antibiotic degradation Fosfomycin
7 D090701 D0907 D09 Degradation Antibiotic degradation Tetracycline
8 D090801 D0908 D09 Degradation Antibiotic degradation Macrolide
9 D091001 D0910 D09 Degradation Antibiotic degradation Chloramphenicol
10 D091101 D0911 D09 Degradation Antibiotic degradation Lincosamide
11 D091201 D0912 D09 Degradation Antibiotic degradation Streptothricin
Definition
1 K18698,K18699,K18796,K18767,K18797,K19097,K19317,K18768,K18970,K19316,K22346,K18795,K19218,K19217,K17836,K18766
2 K17837,K18782,K18781,K18780,K19099,K19216
3 K19095,K19096,K19100,K19101,K19214,K19215,K20319,K20320,K01467
4 K17838,K18790,K18791,K19098,K18792,K19213,K21276,K18793,K18971,K22352,K19209,K18976,K18973,K18794,K18972,K21277,K19210,K19211,K19212,K22335,K19319,K22331,K22351,K19320,K19318,K19321,K19322,K21266,K22334,K22333,K22332
5 K19349,K19350
6 K21252
7 K08151,K08168,K18214,K18218,K18220,K18221
8 K06979,K08217,K18230,K18231,K21251
9 K00638,K08160,K18552,K18553,K18554,K19271
10 K18236,K19349,K19350,K19545
11 K19273,K20816
library(reshape2)
library(RColorBrewer)
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>% # Map Code_element -> Element & Function
filter(str_starts(Code_element, "D09")) %>% # Only antibiotic degradation
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
# Map Code_element IDs to trait names
mutate(trait = case_when(
Code_element %in% GIFT_db$Code_element ~ GIFT_db$Element[match(Code_element, GIFT_db$Code_element)],
TRUE ~ Code_element
)) %>%
# Map function IDs for faceting if desired
mutate(functionid = case_when(
substr(Code_element,1,3) %in% GIFT_db$Code_function ~ GIFT_db$Function[match(substr(Code_element,1,3), GIFT_db$Code_function)],
TRUE ~ substr(Code_element,1,3)
)) %>%
mutate(trait = factor(trait, levels = unique(GIFT_db$Element))) %>%
ggplot(aes(x = trait, y = Genome, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = brewer.pal(7, "YlGnBu")) +
facet_wrap(~ cluster, scales = "free_y", ncol = 1) +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
strip.text.x = element_text(angle = 0)
) +
labs(x = "Trait", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(str_starts(Code_element, "D09")) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
# Ensure trait is a factor to maintain order
mutate(trait = factor(trait, levels = rev(unique(GIFT_db$Element)))) %>%
ggplot( aes(x = Genome, y = trait)) +
# Heatmap
geom_tile(aes(fill = GIFT), color = "white") +
scale_fill_gradientn(colours = brewer.pal(7, "YlGnBu"), name = "GIFT Score") +
# Start a new fill scale for the Source bar
new_scale_fill() +
# Add the source bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = source), height = 0.5) +
scale_fill_manual(values = source_colors) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme_minimal(base_size = 8) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
9.3.2 Checking difference in GIFTs
9.3.2.1 Cluster-wise
Element GIFTs different between clusters of pcoa
# Get only the GIFT columns
gift_cols <- colnames(GIFTs_elements)[!(colnames(GIFTs_elements) %in% c("MAG_ID","cluster","source"))]
gift_dataframe <- GIFTs_elements %>%
as.data.frame() %>%
rownames_to_column(var = "ID")
gift_df_meta <- gift_dataframe %>%
left_join(metadata_with_cluster, by = "ID")
# Kruskal-Wallis for 4 groups
kruskal_results <- sapply(gift_cols, function(g) {
kruskal.test(as.formula(paste(g, "~ cluster")), data = gift_df_meta)$p.value
})
# Adjust for multiple testing
kruskal_results_adj <- p.adjust(kruskal_results, method = "BH")
# Combine into a table
kruskal_table <- data.frame(
GIFT = gift_cols,
p_value = kruskal_results,
p_adj = kruskal_results_adj
)
kruskal_table %>% filter(p_adj<0.05) GIFT p_value p_adj
B0216 B0216 8.964986e-05 1.255098e-03
B0705 B0705 2.448055e-05 4.569703e-04
D0901 D0901 8.145800e-20 2.280824e-18
D0908 D0908 8.145800e-20 2.280824e-18
pairwise_results <- lapply(gift_cols, function(g) {
pairwise.wilcox.test(
x = gift_df_meta[[g]],
g = gift_df_meta$cluster,
p.adjust.method = "BH"
)
})
names(pairwise_results) <- gift_cols
pairwise_sig_table <- lapply(names(pairwise_results), function(g) {
# Extract p-value matrix
pmat <- pairwise_results[[g]]$p.value
# Convert to long format
pmat_long <- as.data.frame(as.table(pmat)) %>%
filter(!is.na(Freq)) %>% # remove NA (diagonal / upper triangle)
dplyr::rename(group1 = Var1, group2 = Var2, p_adj = Freq) %>%
filter(p_adj < 0.05) %>%
mutate(GIFT = g)
return(pmat_long)
})
# Combine all GIFTs into one table
pairwise_sig_table <- bind_rows(pairwise_sig_table) %>%
dplyr::select(GIFT, group1, group2, p_adj)
pairwise_sig_table GIFT group1 group2 p_adj
1 B0105 3 1 3.722745e-02
2 B0216 4 1 1.895970e-02
3 B0216 3 2 1.740565e-03
4 B0216 4 2 3.161198e-04
5 B0705 3 1 4.396047e-03
6 B0705 3 2 2.247836e-05
7 B0705 4 2 9.897753e-03
8 B0706 3 1 3.722745e-02
9 D0901 2 1 3.904407e-08
10 D0901 4 1 4.919048e-08
11 D0901 3 2 2.058151e-14
12 D0901 4 3 2.058151e-14
13 D0908 3 1 9.575590e-14
14 D0908 4 1 4.919048e-08
15 D0908 3 2 2.468680e-14
16 D0908 4 2 8.235871e-09
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
new_scale_fill() +
# Add the source bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = source), height = 0.3) +
scale_fill_manual(values = source_colors) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1, size = 4),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
Code_bundle Code_element Code_function Domain Function Element
1 B010501 B0105 B01 Biosynthesis Nucleic acid biosynthesis ADP/ATP
2 B021601 B0216 B02 Biosynthesis Amino acid biosynthesis Tryptophan
3 B070501 B0705 B07 Biosynthesis Vitamin biosynthesis Pyridoxal-P (B6)
4 B070502 B0705 B07 Biosynthesis Vitamin biosynthesis Pyridoxal-P (B6)
5 B070601 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
6 B070602 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
7 B070603 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
8 B070604 B0706 B07 Biosynthesis Vitamin biosynthesis Biotin (B7)
9 D090101 D0901 D09 Degradation Antibiotic degradation Penicillin
10 D090801 D0908 D09 Degradation Antibiotic degradation Macrolide
Definition
1 K01939 K01756 (K00939,K18532,K18533,K00944) K00940
2 ((((K01657+K01658),K13503,K13501,K01656) K00766),K13497) (((K01817,K24017) (K01656,K01609)),K13498,K13501) ((K01695+(K01696,K06001)),K01694)
3 K03472 K03473 K00831 K00097 K03474 K00275
4 K06215 K08681
5 K00652 (((K00833,K19563) K01935),K19562) K01012
6 K00652 K25570 K01935 K01012
7 K16593 K00652 K19563 K01935 K01012
8 K01906 K00652 (K00833,K19563) K01935 K01012
9 K18698,K18699,K18796,K18767,K18797,K19097,K19317,K18768,K18970,K19316,K22346,K18795,K19218,K19217,K17836,K18766
10 K06979,K08217,K18230,K18231,K21251
library(rstatix)
library(dplyr)
library(purrr)
pairwise_sig_table <- map_df(gift_cols, function(g) {
# Check if there is variation in the GIFT values
# If all values are the same, skip this GIFT
if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
# Run the test safely
tryCatch({
results <- gift_df_meta %>%
wilcox_test(as.formula(paste0("`", g, "` ~ cluster")), p.adjust.method = "BH")
eff <- gift_df_meta %>%
wilcox_effsize(as.formula(paste0("`", g, "` ~ cluster")))
results %>%
left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
filter(p.adj < 0.05) %>%
mutate(GIFT = g)
}, error = function(e) return(NULL)) # If it still fails, just skip it
})
top_gifts <- pairwise_sig_table %>%
arrange(desc(effsize))
top_gifts# A tibble: 6 × 12
.y. group1 group2 n1 n2 statistic p p.adj p.adj.signif effsize magnitude GIFT
<chr> <chr> <chr> <int> <int> <dbl> <dbl> <dbl> <chr> <dbl> <ord> <chr>
1 B0216 2 4 18 17 258. 0.0000527 0.000316 *** 0.687 large B0216
2 B0705 2 3 18 44 151 0.00000375 0.0000225 **** 0.589 large B0705
3 B0705 2 4 18 17 79 0.005 0.01 ** 0.478 moderate B0705
4 B0216 1 4 14 17 179 0.009 0.019 * 0.470 moderate B0216
5 B0216 2 3 18 44 583 0.00058 0.002 ** 0.438 moderate B0216
6 B0705 1 3 14 44 174 0.001 0.004 ** 0.419 moderate B0705
library(reshape2)
library(RColorBrewer)
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
# Only keep significant traits
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>% # human-readable trait
ggplot(aes(x = trait, y = Genome, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
facet_wrap(~ cluster, scales = "free_y", ncol = 1) +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
strip.text.x = element_text(angle = 0)
) +
labs(x = "Trait", fill = "GIFT")GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% pairwise_sig_table$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
facet_grid(~ cluster, scales = "free_x", space = "free_x") +
theme_grey(base_size = 8) +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.
9.3.2.2 Source-wise
GIFTs that are different between EHI and GTDB
wilcox_results <- sapply(gift_cols, function(g) {
wilcox.test(as.formula(paste(g, "~ source")), data = gift_df_meta)$p.value
})
wilcox_results_adj <- p.adjust(wilcox_results, method = "BH")
data.frame(
GIFT = gift_cols,
p_value = wilcox_results,
p_adj = wilcox_results_adj
) GIFT p_value p_adj
B0101 B0101 0.0056603987 0.07924558
B0102 B0102 0.5605090914 0.58126869
B0103 B0103 NaN NaN
B0104 B0104 0.5605090914 0.58126869
B0105 B0105 0.4795942525 0.56798830
B0106 B0106 0.2222308172 0.41483086
B0204 B0204 0.5605090914 0.58126869
B0205 B0205 0.3981400285 0.55739604
B0206 B0206 0.1041649692 0.22435532
B0207 B0207 0.2946362754 0.47167436
B0208 B0208 NaN NaN
B0209 B0209 0.4689178506 0.56798830
B0210 B0210 0.1041649692 0.22435532
B0211 B0211 0.4689178506 0.56798830
B0212 B0212 0.0199165012 0.11157331
B0213 B0213 NaN NaN
B0214 B0214 0.5605090914 0.58126869
B0215 B0215 0.0008357308 0.02341747
B0216 B0216 0.4388035480 0.56798830
B0217 B0217 0.1041649692 0.22435532
B0218 B0218 NaN NaN
B0219 B0219 0.0631194694 0.22435532
B0220 B0220 0.1166255893 0.23890272
B0221 B0221 0.3054108219 0.47508350
B0302 B0302 NaN NaN
B0303 B0303 0.1041649692 0.22435532
B0307 B0307 0.0024550065 0.04582679
B0309 B0309 0.1041649692 0.22435532
B0310 B0310 NaN NaN
B0401 B0401 NaN NaN
B0402 B0402 NaN NaN
B0403 B0403 NaN NaN
B0601 B0601 0.1041649692 0.22435532
B0602 B0602 0.1041649692 0.22435532
B0603 B0603 0.1194513584 0.23890272
B0604 B0604 NaN NaN
B0605 B0605 NaN NaN
B0701 B0701 NaN NaN
B0702 B0702 0.3480882112 0.52683621
B0703 B0703 NaN NaN
B0704 B0704 0.3981400285 0.55739604
B0705 B0705 0.2602709220 0.47016683
B0706 B0706 0.0199238062 0.11157331
B0707 B0707 0.0199238062 0.11157331
B0708 B0708 0.9410448183 0.94104482
B0709 B0709 0.1041649692 0.22435532
B0710 B0710 0.0220561153 0.11228568
B0711 B0711 0.0101037961 0.11157331
B0712 B0712 0.1041649692 0.22435532
B0801 B0801 NaN NaN
B0802 B0802 NaN NaN
B0803 B0803 NaN NaN
B0804 B0804 NaN NaN
B0805 B0805 NaN NaN
B0901 B0901 NaN NaN
B0902 B0902 NaN NaN
B0903 B0903 NaN NaN
B1004 B1004 NaN NaN
B1006 B1006 NaN NaN
B1008 B1008 NaN NaN
B1011 B1011 NaN NaN
B1012 B1012 NaN NaN
B1014 B1014 NaN NaN
B1021 B1021 NaN NaN
B1022 B1022 NaN NaN
B1024 B1024 NaN NaN
B1026 B1026 0.0345084745 0.16103955
B1028 B1028 NaN NaN
B1029 B1029 NaN NaN
B1041 B1041 NaN NaN
B1042 B1042 NaN NaN
D0101 D0101 0.0199165012 0.11157331
D0102 D0102 NaN NaN
D0103 D0103 NaN NaN
D0104 D0104 NaN NaN
D0201 D0201 NaN NaN
D0202 D0202 NaN NaN
D0203 D0203 NaN NaN
D0204 D0204 NaN NaN
D0205 D0205 NaN NaN
D0206 D0206 NaN NaN
D0207 D0207 NaN NaN
D0208 D0208 NaN NaN
D0209 D0209 NaN NaN
D0210 D0210 NaN NaN
D0211 D0211 NaN NaN
D0212 D0212 NaN NaN
D0213 D0213 NaN NaN
D0301 D0301 NaN NaN
D0302 D0302 NaN NaN
D0303 D0303 NaN NaN
D0304 D0304 NaN NaN
D0305 D0305 NaN NaN
D0306 D0306 NaN NaN
D0307 D0307 NaN NaN
D0308 D0308 NaN NaN
D0309 D0309 NaN NaN
D0310 D0310 NaN NaN
D0401 D0401 NaN NaN
D0402 D0402 NaN NaN
D0403 D0403 NaN NaN
D0404 D0404 NaN NaN
D0405 D0405 NaN NaN
D0406 D0406 NaN NaN
D0407 D0407 NaN NaN
D0408 D0408 NaN NaN
D0501 D0501 NaN NaN
D0502 D0502 NaN NaN
D0503 D0503 NaN NaN
D0504 D0504 NaN NaN
D0505 D0505 NaN NaN
D0506 D0506 NaN NaN
D0507 D0507 0.0631194694 0.22435532
D0508 D0508 NaN NaN
D0509 D0509 0.1041649692 0.22435532
D0510 D0510 NaN NaN
D0511 D0511 NaN NaN
D0512 D0512 0.5605090914 0.58126869
D0513 D0513 0.2946362754 0.47167436
D0516 D0516 NaN NaN
D0517 D0517 NaN NaN
D0518 D0518 NaN NaN
D0601 D0601 0.0008363381 0.02341747
D0602 D0602 NaN NaN
D0603 D0603 NaN NaN
D0604 D0604 NaN NaN
D0606 D0606 NaN NaN
D0607 D0607 NaN NaN
D0608 D0608 NaN NaN
D0609 D0609 NaN NaN
D0610 D0610 NaN NaN
D0611 D0611 NaN NaN
D0612 D0612 NaN NaN
D0613 D0613 NaN NaN
D0701 D0701 NaN NaN
D0702 D0702 NaN NaN
D0704 D0704 NaN NaN
D0705 D0705 NaN NaN
D0706 D0706 NaN NaN
D0708 D0708 NaN NaN
D0709 D0709 NaN NaN
D0801 D0801 NaN NaN
D0802 D0802 NaN NaN
D0804 D0804 NaN NaN
D0805 D0805 NaN NaN
D0806 D0806 NaN NaN
D0807 D0807 0.8098046490 0.82452837
D0808 D0808 NaN NaN
D0809 D0809 NaN NaN
D0810 D0810 NaN NaN
D0811 D0811 NaN NaN
D0812 D0812 NaN NaN
D0813 D0813 NaN NaN
D0814 D0814 NaN NaN
D0815 D0815 NaN NaN
D0816 D0816 NaN NaN
D0817 D0817 NaN NaN
D0818 D0818 NaN NaN
D0819 D0819 NaN NaN
D0901 D0901 0.4478557006 0.56798830
D0902 D0902 0.4689178506 0.56798830
D0903 D0903 NaN NaN
D0904 D0904 0.3981400285 0.55739604
D0905 D0905 NaN NaN
D0906 D0906 NaN NaN
D0907 D0907 0.4689178506 0.56798830
D0908 D0908 0.4969897664 0.56798830
D0910 D0910 0.0199165012 0.11157331
D0911 D0911 0.2947964749 0.47167436
D0912 D0912 0.1041649692 0.22435532
S0101 S0101 NaN NaN
S0103 S0103 NaN NaN
S0104 S0104 NaN NaN
S0105 S0105 0.2947964749 0.47167436
S0201 S0201 0.0707191711 0.22435532
S0202 S0202 0.4954727815 0.56798830
S0301 S0301 0.1247781764 0.24095096
effect_size <- sapply(gift_cols, function(g) {
median(gift_df_meta[[g]][gift_df_meta$source=="GTDB"]) -
median(gift_df_meta[[g]][gift_df_meta$source=="EHI"])
})
wilcox_res_source <- data.frame(
GIFT = gift_cols,
p_value = wilcox_results,
p_adj = wilcox_results_adj,
effect = effect_size
)
wilcox_res_source %>% dplyr::select(GIFT, p_adj, effect) %>% filter(p_adj<0.05) GIFT p_adj effect
B0215 B0215 0.02341747 0
B0307 B0307 0.04582679 0
D0601 D0601 0.02341747 0
Code_bundle Code_element Code_function Domain Function Element
1 B021501 B0215 B02 Biosynthesis Amino acid biosynthesis Histidine
2 B030701 B0307 B03 Biosynthesis Amino acid derivative biosynthesis Spermidine
3 D060101 D0601 D06 Degradation Nitrogen compound degradation Nitrate
4 D060102 D0601 D06 Degradation Nitrogen compound degradation Nitrate
5 D060103 D0601 D06 Degradation Nitrogen compound degradation Nitrate
6 D060105 D0601 D06 Degradation Nitrogen compound degradation Nitrate
Definition
1 K00765 ((K01523 K01496),K11755,K14152) (K01814,K24017) ((K02501+K02500),K01663) ((K01693 K00817 (K04486,K05602,K18649)),(K01089 K00817)) (K00013,K14152)
2 (K01583,K01584,K01585,K02626) K01480
3 ((K00370+K00371+K00374),(K02567+K02568)) ((K00362+K00363),(K03385+K15876))
4 1.7.5.1 1.7.2.1 1.7.2.5 1.7.2.4
5 1.9.6.1 1.7.2.2
6 1.7.7.2 1.7.7.1
wilcox_res_source <- map_df(gift_cols, function(g) {
if(length(unique(gift_df_meta[[g]])) < 2) return(NULL)
tryCatch({
# Calculate p-value
res <- gift_df_meta %>%
wilcox_test(as.formula(paste0("`", g, "` ~ source")))
# Calculate effect size (Rank-Biserial Correlation)
eff <- gift_df_meta %>%
wilcox_effsize(as.formula(paste0("`", g, "` ~ source")))
# Combine results
res %>%
left_join(eff, by = c("group1", "group2", ".y." = ".y.", "n1", "n2")) %>%
mutate(GIFT = g)
}, error = function(e) return(NULL))
})
# Adjust P-values and filter
wilcox_res_source <- wilcox_res_source %>%
mutate(p_adj = p.adjust(p, method = "BH")) %>%
dplyr::select(GIFT, group1, group2, p_adj, effsize, magnitude) %>%
filter(p_adj < 0.05) %>%
arrange(desc(abs(effsize))) # Sort by strongest effect
# View the top results
print(wilcox_res_source)# A tibble: 3 × 6
GIFT group1 group2 p_adj effsize magnitude
<chr> <chr> <chr> <dbl> <dbl> <ord>
1 B0215 EHI GTDB 0.0234 0.348 moderate
2 D0601 EHI GTDB 0.0234 0.348 moderate
3 B0307 EHI GTDB 0.0459 0.315 moderate
GIFTs_elements %>%
melt() %>%
dplyr::rename(Genome = Var1, Code_element = Var2, GIFT = value) %>%
inner_join(GIFT_db, by = "Code_element") %>%
filter(Code_element %in% wilcox_res_source$GIFT) %>%
left_join(metadata_with_cluster, by = join_by(Genome == ID)) %>%
distinct(Genome, Code_element, .keep_all = TRUE) %>%
mutate(cluster = factor(cluster)) %>%
droplevels() %>%
arrange(cluster, Genome) %>%
mutate(trait = Element) %>%
ggplot(aes(x = Genome, y = trait, fill = GIFT)) +
geom_tile(colour = "white", linewidth = 0.2) +
scale_y_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_x_discrete(guide = guide_axis(check.overlap = TRUE)) +
scale_fill_gradientn(limits = c(0,1), colours = RColorBrewer::brewer.pal(7, "YlGnBu")) +
new_scale_fill() +
# Add the source bar at the very top or bottom
geom_tile(aes(y = -0.5, fill = source), height = 0.5) +
scale_fill_manual(values = source_colors) +
facet_grid(~ source, scales = "free_x", space = "free_x") +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
strip.text.x = element_text(face = "bold")
) +
labs(y = "Trait", x = "Genome", fill = "GIFT")Warning in inner_join(., GIFT_db, by = "Code_element"): Detected an unexpected many-to-many relationship between `x` and `y`.
ℹ Row 745 of `x` matches multiple rows in `y`.
ℹ Row 1 of `y` matches multiple rows in `x`.
ℹ If a many-to-many relationship is expected, set `relationship = "many-to-many"` to silence this warning.